# params
# NOTE(review): despite the comment, CUTOFF is used below as the minimum
# number of reviewed *places* per city (filter(n > CUTOFF)) — confirm intent.
CUTOFF <- 4 # number of reviews cutoff
#STORE_TYPE <- "Casey's General Store"
#STORE_TYPE <- "SUBWAY®Restaurants"
Read in all reviews and merge in metadata
# Read every per-city review feather file and tag each row with its city,
# parsed from the filename prefix before the first "_".
all.review.files <- list.files("../data/reviews/")
all.reviews <- map(all.review.files, function(x) {
  read_feather(paste0("../data/reviews/", x)) %>%
    mutate(city = unlist(str_split(x, "_"))[1])}) %>%
  bind_rows()
# Drop missing/empty reviews, keep the columns used downstream, and coerce
# them all to factors. across() replaces the deprecated mutate_all(funs()).
all.reviews.clean <- all.reviews %>%
  filter(!is.na(reviews)) %>%
  filter(reviews != "") %>%
  select(city, place.id, reviews, author.names) %>%
  mutate(across(everything(), as.factor))
# Read place-id metadata for every city and normalize the city column name.
all.place_id.files <- list.files("../data/place_ids/")
all.place_id.meta <- map(all.place_id.files, function(x) {
  read_feather(paste0("../data/place_ids/", x))}) %>%
  bind_rows() %>%
  rename(city = location)
# Keep join keys and labels as factors; across() replaces deprecated funs().
all.place_id.meta.clean <- all.place_id.meta %>%
  select(place.id, city, types, name) %>%
  mutate(across(everything(), as.factor)) %>%
  distinct(place.id, city, types, name) # some place ids are duplicated across cities
# Merge reviews with place metadata on (place.id, city) — explicit keys so the
# join cannot silently pick up extra common columns — and add a unique row id.
d <- left_join(all.reviews.clean, all.place_id.meta.clean,
               by = c("place.id", "city")) %>%
  mutate(review_num = seq_len(nrow(.))) %>% # seq_len is safe for 0 rows
  select(review_num, city, place.id, types, name, reviews, author.names)
#filter(name == STORE_TYPE)
There are 24,469 reviews in total.
Distribution of store types
# For each store name, count the number of cities it appears in
# (one row per (name, city) pair, then tally per name, largest first).
all.place_id.meta.clean %>%
  distinct(name, city) %>%
  count(name, sort = TRUE)
What does the distribution of reviewed places look like across cities?
# Number of distinct reviewed place names in each city.
n.place.per.city <- d %>%
  distinct(city, name) %>%
  count(city)
# Histogram of reviewed-place counts per city.
ggplot(n.place.per.city, aes(x = n)) +
  geom_histogram() +
  labs(title = "# place with reviews per city") +
  theme_bw()
# ggplot(n.place.per.city, aes(x = "", y = n)) +
# geom_boxplot() +
# theme_bw() +
# ggtitle("# place with reviews per city") +
# xlab("")
# Keep only cities with more than CUTOFF distinct reviewed places.
big.cities <- filter(n.place.per.city, n > CUTOFF)
There are 546 cities with at least 5 places with reviews.
# Attach one (lat, lon) pair per city; explicit key avoids an implicit
# natural join message and accidental extra keys.
big.cities <- left_join(big.cities,
                        all.place_id.meta %>%
                          select(city, lat, lon) %>%
                          group_by(city) %>%
                          slice(1),
                        by = "city")
# Bounding box for Iowa (xmin, ymin, xmax, ymax), used by all maps below.
iowa_bounding_box = c(-96.6397171020508, 40.3755989074707, # southwest coordinates
-90.1400604248047, 43.5011367797852) # northeast coordinates
# NOTE(review): `map` shadows purrr::map() for the rest of the script —
# consider renaming (e.g. base_map). Also, Stamen tiles moved to Stadia;
# get_stamenmap() may need updating in newer ggmap versions — TODO confirm.
map <- get_stamenmap(iowa_bounding_box, zoom = 5, maptype = "toner-lite")
# Plot each qualifying city, point size scaled by log place count.
ggmap(map) +
geom_point(aes(x = lon, y = lat, size = log(n)),
data = big.cities, alpha = .5, color = "red") +
theme_bw()
Now we’re only going to include these cities.
# Restrict the review data to the qualifying cities.
# (%in% already treats repeated values correctly, so unique() is not needed.)
d.big <- filter(d, city %in% big.cities$city)
We calculate these measures on each review and then take the city mean.
# reg expression for emojis: alternation over every emoji glyph
emj <- paste(unlist(trimws(emoji(list_emoji(), TRUE))), collapse = "|")
# Clean review text: collapse runs of spaces, strip urls/newlines/quotes, and
# replace emoji plus common ASCII emoticons with the token "EMOJI".
d.big <- d.big %>%
  # Original code replaced " " with " " (a no-op); " +" actually collapses
  # repeated spaces as the comment intended. (If the source data used a
  # non-breaking space, that may have been the original target — TODO confirm.)
  mutate(reviews.clean = trimws(str_replace_all(reviews, " +", " ")), # remove extra spaces
         reviews.clean = str_replace_all(reviews.clean, "http[[:alnum:]]*", ""), # remove urls
         reviews.clean = str_replace_all(reviews.clean, "\n|\"", ""),
         reviews.clean = str_replace_all(reviews.clean, emj, "EMOJI"),
         reviews.clean = str_replace_all(reviews.clean, ":\\)|:\\(|:-\\)|:-\\(", "EMOJI")) # replace emoticons
# Per-review surface features: cleaned length and the proportion of capital
# letters, exclamation marks, and EMOJI tokens relative to that length.
# str_count() is vectorized, so the rowwise()/gregexpr machinery is unneeded
# and much slower. (A zero-length review still yields NaN, as before.)
d.big <- d.big %>%
  mutate(review.length = str_length(reviews.clean),
         prop.cap = str_count(reviews.clean, "[A-Z]") / review.length,
         prop.exclam = str_count(reviews.clean, "!") / review.length,
         prop.emoji = str_count(reviews.clean, "EMOJI") / review.length)
# Reshape to long format: one row per (city, measure, value).
# pivot_longer() replaces the superseded gather().
long.d <- d.big %>%
  ungroup() %>%
  select(city, review.length, prop.cap, prop.exclam, prop.emoji) %>%
  pivot_longer(cols = -city, names_to = "measure", values_to = "value")
# City-level mean of each measure.
city.d <- long.d %>%
  group_by(city, measure) %>%
  summarize(mean = mean(value, na.rm = TRUE))
# Histograms of log city means for the three proportion measures.
city.d %>%
  filter(measure != "review.length") %>%
  ggplot(aes(x = log(mean))) +
  geom_histogram(aes(fill = measure)) +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  theme(legend.position = "none")
# Histogram of log mean review length per city.
city.d %>%
  filter(measure == "review.length") %>%
  ggplot(aes(x = log(mean))) +
  geom_histogram(aes(fill = measure)) +
  facet_wrap(~measure, scales = "free") +
  theme_bw() +
  theme(legend.position = "none")
# Attach one coordinate pair per city to the city-level means (explicit key).
city.d.coord <- inner_join(ungroup(city.d),
                           all.place_id.meta %>%
                             select(city, lat, lon) %>%
                             group_by(city) %>%
                             slice(1),
                           by = "city")
# Map of log mean review length per city (alpha encodes the value).
ggmap(map) +
  geom_point(data = filter(city.d.coord, measure == "review.length"),
             aes(x = lon, y = lat, alpha = log(mean)),
             color = "red") +
  ggtitle("log length") +
  theme_bw()
# Log-transform the capitalization means; other measures pass through as-is.
city.d.coord <- city.d.coord %>%
  mutate(log.prop.cap = ifelse(measure == "prop.cap", log(mean), mean))
# log(0) yields -Inf; convert infinite entries to NA so they drop out of the
# plots. (Replaces the obscure `is.na(x) <- sapply(x, is.infinite)` idiom;
# is.infinite() is FALSE for non-numeric columns, so numeric-only is identical.)
city.d.coord <- city.d.coord %>%
  mutate(across(where(is.numeric), ~replace(.x, is.infinite(.x), NA)))
# Map: log proportion of capital letters per city.
ggmap(map) +
  geom_point(data = filter(city.d.coord, measure == "prop.cap"),
             aes(x = lon, y = lat, alpha = log.prop.cap),
             color = "red") +
  ggtitle("log prop capitalization") +
  theme_bw()
# Map: log proportion of exclamation marks (zero-mean cities excluded,
# since log(0) is -Inf).
ggmap(map) +
  geom_point(data = filter(city.d.coord, measure == "prop.exclam", mean != 0),
             aes(x = lon, y = lat, alpha = log(mean)),
             color = "red") +
  ggtitle("log prop exclam") +
  theme_bw()
# Map: log proportion of EMOJI tokens (zero-mean cities excluded).
ggmap(map) +
  geom_point(data = filter(city.d.coord, measure == "prop.emoji", mean != 0),
             aes(x = lon, y = lat, alpha = log(mean)),
             color = "red") +
  ggtitle("log prop emoji") +
  theme_bw()
Create corpus for each city
# Tokenize reviews into one word per row (factor -> character first, since
# unnest_tokens() expects character input).
d.tokenized <- d.big %>%
  ungroup() %>%
  mutate(reviews = as.character(reviews)) %>%
  unnest_tokens(word, reviews)
# remove stop words; explicit key documents the join and silences the message
data(stop_words)
d.tokenized.clean <- d.tokenized %>%
  anti_join(stop_words, by = "word")
# Word frequencies, ascending. (The sort = TRUE is immediately overridden by
# arrange(n); kept for parity, but only the final order matters.)
counts <- d.tokenized.clean %>%
  count(word, sort = TRUE) %>%
  arrange(n)
# remove infrequent words (fewer than 10 occurrences overall)
infrequent.words <- d.tokenized.clean %>%
  count(word, sort = TRUE) %>%
  filter(n < 10)
# Explicit by = "word": infrequent.words also carries an `n` column, and we
# must not let a future `n` column in the tokens join on it accidentally.
d.tokenized.clean <- d.tokenized.clean %>%
  anti_join(infrequent.words, by = "word")
# City-level mean AFINN sentiment, most negative first.
# NOTE(review): newer tidytext releases name the AFINN column `value`, not
# `score` — confirm against the installed version. This `affin` is
# recomputed below and this copy is otherwise unused.
affin <- d.tokenized.clean %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(city) %>%
  summarize(mean = mean(score, na.rm = TRUE)) %>%
  arrange(mean)
# Cleaned-token count per city, ascending.
numwords <- d.tokenized.clean %>%
  count(city) %>%
  arrange(n)
# Negative-sentiment word list from the bing lexicon.
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")
# Total word count per review. BUG FIX: the original ended with
# select(city, review_num, word), but after summarize() only review_num and
# num_words exist, so that select() errors — and num_words is exactly what
# the left_join below needs. Keep summarize()'s output as-is.
wordcounts <- d.tokenized %>%
  group_by(review_num) %>%
  summarize(num_words = n())
# Share of negative-lexicon words in each review.
prop.negative <- d.tokenized %>%
  semi_join(bingnegative, by = "word") %>%
  count(review_num, name = "negativewords") %>%
  left_join(wordcounts, by = "review_num") %>%
  mutate(prop.neg = negativewords / num_words) %>%
  select(review_num, prop.neg)
# City-level mean AFINN sentiment, most negative first.
# NOTE(review): newer tidytext names the AFINN column `value`, not `score`
# — confirm against the installed version.
affin <- d.tokenized.clean %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(city) %>%
  summarize(affin.mean = mean(score, na.rm = TRUE)) %>%
  arrange(affin.mean)
# Attach one coordinate pair per city (explicit key), then look at the
# distribution of city sentiment.
affin.cord <- inner_join(ungroup(affin),
                         all.place_id.meta %>%
                           select(city, lat, lon) %>%
                           group_by(city) %>%
                           slice(1),
                         by = "city")
ggplot(affin.cord, aes(x = affin.mean)) +
  geom_histogram() +
  theme_bw() +
  ggtitle("Sentiment")
# Map of mean sentiment per city (alpha encodes the value).
ggmap(map) +
  geom_point(data = affin.cord,
             aes(x = lon, y = lat, alpha = affin.mean),
             color = "red") +
  ggtitle("Sentiment") +
  theme_bw()
# Rebuild one text per city from the cleaned tokens, then measure lexical
# diversity (TTR etc.) with quanteda.
city.corpus = d.tokenized.clean %>%
group_by(city) %>%
summarise(text=paste(word, collapse=" "))
docs = corpus(city.corpus,
doc_id = "city", text_field = "text")
# NOTE(review): passing `groups` directly to dfm() and running
# textstat_lexdiv() on a dfm are older quanteda idioms; newer versions expect
# tokens() input and dfm_group(), and corpus() uses `docid_field` rather than
# `doc_id` — confirm against the installed quanteda version.
dfm_docs = dfm(docs, groups = "city")
# textstat_lexdiv() puts document ids in rownames; move them into a column.
unigram.diversity = textstat_lexdiv(dfm_docs) %>%
rownames_to_column() %>%
rename(city = rowname)
Sampling
1:1000
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13
## [14] 14 15 16 17 18 19 20 21 22 23 24 25 26
## [27] 27 28 29 30 31 32 33 34 35 36 37 38 39
## [40] 40 41 42 43 44 45 46 47 48 49 50 51 52
## [53] 53 54 55 56 57 58 59 60 61 62 63 64 65
## [66] 66 67 68 69 70 71 72 73 74 75 76 77 78
## [79] 79 80 81 82 83 84 85 86 87 88 89 90 91
## [92] 92 93 94 95 96 97 98 99 100 101 102 103 104
## [105] 105 106 107 108 109 110 111 112 113 114 115 116 117
## [118] 118 119 120 121 122 123 124 125 126 127 128 129 130
## [131] 131 132 133 134 135 136 137 138 139 140 141 142 143
## [144] 144 145 146 147 148 149 150 151 152 153 154 155 156
## [157] 157 158 159 160 161 162 163 164 165 166 167 168 169
## [170] 170 171 172 173 174 175 176 177 178 179 180 181 182
## [183] 183 184 185 186 187 188 189 190 191 192 193 194 195
## [196] 196 197 198 199 200 201 202 203 204 205 206 207 208
## [209] 209 210 211 212 213 214 215 216 217 218 219 220 221
## [222] 222 223 224 225 226 227 228 229 230 231 232 233 234
## [235] 235 236 237 238 239 240 241 242 243 244 245 246 247
## [248] 248 249 250 251 252 253 254 255 256 257 258 259 260
## [261] 261 262 263 264 265 266 267 268 269 270 271 272 273
## [274] 274 275 276 277 278 279 280 281 282 283 284 285 286
## [287] 287 288 289 290 291 292 293 294 295 296 297 298 299
## [300] 300 301 302 303 304 305 306 307 308 309 310 311 312
## [313] 313 314 315 316 317 318 319 320 321 322 323 324 325
## [326] 326 327 328 329 330 331 332 333 334 335 336 337 338
## [339] 339 340 341 342 343 344 345 346 347 348 349 350 351
## [352] 352 353 354 355 356 357 358 359 360 361 362 363 364
## [365] 365 366 367 368 369 370 371 372 373 374 375 376 377
## [378] 378 379 380 381 382 383 384 385 386 387 388 389 390
## [391] 391 392 393 394 395 396 397 398 399 400 401 402 403
## [404] 404 405 406 407 408 409 410 411 412 413 414 415 416
## [417] 417 418 419 420 421 422 423 424 425 426 427 428 429
## [430] 430 431 432 433 434 435 436 437 438 439 440 441 442
## [443] 443 444 445 446 447 448 449 450 451 452 453 454 455
## [456] 456 457 458 459 460 461 462 463 464 465 466 467 468
## [469] 469 470 471 472 473 474 475 476 477 478 479 480 481
## [482] 482 483 484 485 486 487 488 489 490 491 492 493 494
## [495] 495 496 497 498 499 500 501 502 503 504 505 506 507
## [508] 508 509 510 511 512 513 514 515 516 517 518 519 520
## [521] 521 522 523 524 525 526 527 528 529 530 531 532 533
## [534] 534 535 536 537 538 539 540 541 542 543 544 545 546
## [547] 547 548 549 550 551 552 553 554 555 556 557 558 559
## [560] 560 561 562 563 564 565 566 567 568 569 570 571 572
## [573] 573 574 575 576 577 578 579 580 581 582 583 584 585
## [586] 586 587 588 589 590 591 592 593 594 595 596 597 598
## [599] 599 600 601 602 603 604 605 606 607 608 609 610 611
## [612] 612 613 614 615 616 617 618 619 620 621 622 623 624
## [625] 625 626 627 628 629 630 631 632 633 634 635 636 637
## [638] 638 639 640 641 642 643 644 645 646 647 648 649 650
## [651] 651 652 653 654 655 656 657 658 659 660 661 662 663
## [664] 664 665 666 667 668 669 670 671 672 673 674 675 676
## [677] 677 678 679 680 681 682 683 684 685 686 687 688 689
## [690] 690 691 692 693 694 695 696 697 698 699 700 701 702
## [703] 703 704 705 706 707 708 709 710 711 712 713 714 715
## [716] 716 717 718 719 720 721 722 723 724 725 726 727 728
## [729] 729 730 731 732 733 734 735 736 737 738 739 740 741
## [742] 742 743 744 745 746 747 748 749 750 751 752 753 754
## [755] 755 756 757 758 759 760 761 762 763 764 765 766 767
## [768] 768 769 770 771 772 773 774 775 776 777 778 779 780
## [781] 781 782 783 784 785 786 787 788 789 790 791 792 793
## [794] 794 795 796 797 798 799 800 801 802 803 804 805 806
## [807] 807 808 809 810 811 812 813 814 815 816 817 818 819
## [820] 820 821 822 823 824 825 826 827 828 829 830 831 832
## [833] 833 834 835 836 837 838 839 840 841 842 843 844 845
## [846] 846 847 848 849 850 851 852 853 854 855 856 857 858
## [859] 859 860 861 862 863 864 865 866 867 868 869 870 871
## [872] 872 873 874 875 876 877 878 879 880 881 882 883 884
## [885] 885 886 887 888 889 890 891 892 893 894 895 896 897
## [898] 898 899 900 901 902 903 904 905 906 907 908 909 910
## [911] 911 912 913 914 915 916 917 918 919 920 921 922 923
## [924] 924 925 926 927 928 929 930 931 932 933 934 935 936
## [937] 937 938 939 940 941 942 943 944 945 946 947 948 949
## [950] 950 951 952 953 954 955 956 957 958 959 960 961 962
## [963] 963 964 965 966 967 968 969 970 971 972 973 974 975
## [976] 976 977 978 979 980 981 982 983 984 985 986 987 988
## [989] 989 990 991 992 993 994 995 996 997 998 999 1000
# Cleaned-token count per city, largest first.
num.words.per.city <- d.tokenized.clean %>%
  count(city, sort = TRUE)
## maybe do walmart?
# Attach coordinates (explicit key) and map type-token ratio (TTR) per city.
unigram.diversity <- left_join(unigram.diversity,
                               all.place_id.meta %>%
                                 select(city, lat, lon) %>%
                                 group_by(city) %>%
                                 slice(1),
                               by = "city")
ggmap(map) +
  geom_point(data = unigram.diversity,
             aes(x = lon, y = lat, alpha = TTR),
             color = "red") +
  ggtitle("TTR") +
  theme_bw()
# Bigram tokens per review. Made consistent with the unigram pipeline below:
# the original grouped by (city, reviews) before unnest_tokens() — almost
# certainly accidental, and downstream code regroups anyway — and skipped the
# factor -> character conversion that unnest_tokens() needs.
bigrams <- d.big %>%
  select(city, review_num, reviews) %>%
  ungroup() %>%
  mutate(reviews = as.character(reviews)) %>%
  unnest_tokens(bigram, reviews, token = "ngrams", n = 2)
# Unigram tokens per review.
unigrams <- d.big %>%
  select(city, review_num, reviews) %>%
  ungroup() %>%
  mutate(reviews = as.character(reviews)) %>%
  unnest_tokens(unigram, reviews)
# Number of distinct unigram types per city.
num_unigrams <- unigrams %>%
  distinct(city, unigram) %>%
  group_by(city) %>%
  summarize(n.unigrams = n())
# Bigram tokens per city, normalized by the number of unigram types.
# ungroup() first so any grouping carried on `bigrams` cannot leak in.
bigram.diversity <- bigrams %>%
  ungroup() %>%
  count(city, name = "n.bigrams") %>%
  left_join(num_unigrams, by = "city") %>%
  mutate(normalized.bigrams = n.bigrams / n.unigrams)
# Combine every city-level language measure into one wide table.
# pivot_wider() replaces the superseded spread().
all.lang <- city.d %>%
  pivot_wider(names_from = measure, values_from = mean) %>%
  left_join(unigram.diversity, by = "city") %>%
  left_join(bigram.diversity, by = "city") %>%
  left_join(affin, by = "city") %>%
  mutate(log.review.length = log(review.length),
         log.prop.cap = log(prop.cap),
         log.prop.exclam = log(prop.exclam),
         log.prop.emoji = log(prop.emoji)) %>%
  # drop raw measures and unused lexdiv columns
  select(-review.length, -prop.cap, -prop.exclam,
         -prop.emoji, -U, -lgV0, -lgeV0)
# log(0) gives -Inf; convert infinite numeric entries to NA so they do not
# distort correlations. (Replaces the `is.na(x) <- sapply(...)` idiom.)
all.lang <- all.lang %>%
  mutate(across(where(is.numeric), ~replace(.x, is.infinite(.x), NA)))
Are the linguistic measures correlated with the outcome measures?
# Highway-distance outcomes per city; log-transform the skewed duration
# measures and drop the raw columns.
dist <- read_csv("../data/city_highway_distances.csv") %>%
  rename(city = location) %>%
  mutate(log.mindrive = log(mindrive), log.time = log(time)) %>%
  select(-c(mindrive, mindist, time))
# Merge language measures with highway-distance outcomes; drop coordinates.
# NOTE(review): `all` shadows base::all() — consider renaming.
all <- all.lang %>%
  left_join(dist, by = "city") %>%
  select(-lat, -lon)
# Correlation matrix across all numeric measures (lower triangle only).
correlate(all[, -1]) %>%
  focus(TTR:log.time, mirror = TRUE) %>%
  shave(upper = TRUE) %>%
  rplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Headline relationship: lexical diversity vs. log drive time.
cor.test(all$log.time, all$TTR)
##
## Pearson's product-moment correlation
##
## data: all$log.time and all$TTR
## t = 7.729, df = 544, p-value = 5.268e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2368956 0.3882203
## sample estimates:
## cor
## 0.3145551
# Scatterplot with a linear fit for a pair of city-level measures.
scatter_fit <- function(df, xvar, yvar) {
  ggplot(df, aes(x = {{ xvar }}, y = {{ yvar }})) +
    geom_point() +
    geom_smooth(method = "lm") +
    theme_bw()
}
scatter_fit(all, log.time, TTR)
scatter_fit(all, log.time, log.review.length)
scatter_fit(all, TTR, log.review.length)
scatter_fit(all, log.time, affin.mean)
scatter_fit(all, log.time, log.prop.emoji)
Heteroscedasticity. TODO: sample rows to get TTR; do Subway; MTLD.